{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "85ad8c61-1742-45bd-aa54-c6c26d3cb74f",
   "metadata": {},
   "source": [
    "# Embedded Tables Pack\n",
    "\n",
    "This LlamaPack provides an example of our embedded-tables pack (with recursive retrieval + Unstructured.io)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "20f56f5a-4e7c-45a0-bf00-10454de8e6a3",
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install llama-index llama-hub unstructured==0.10.18 lxml"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "6b72b41c-369e-4917-9979-c4160811d54e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import nest_asyncio\n",
    "\n",
    "nest_asyncio.apply()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4817daa5-9722-4837-add9-faa648aae14a",
   "metadata": {},
   "source": [
    "### Setup Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "539812e0-6ffb-45c4-8157-0f7e38d3c5dd",
   "metadata": {},
   "outputs": [],
   "source": [
    "!wget \"https://www.dropbox.com/scl/fi/mlaymdy1ni1ovyeykhhuk/tesla_2021_10k.htm?rlkey=qf9k4zn0ejrbm716j0gg7r802&dl=1\" -O tesla_2021_10k.htm"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "905c6c77-91bc-4ed4-ac82-481bf221c603",
   "metadata": {},
   "source": [
    "### Download and Initialize Pack\n",
    "\n",
    "Note that this pack directly takes in the html file, no need to load it beforehand."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "9b929db0-a482-44b3-9bbe-3113f7ca91ff",
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.llama_pack import download_llama_pack\n",
    "\n",
    "EmbeddedTablesUnstructuredRetrieverPack = download_llama_pack(\n",
    "    \"EmbeddedTablesUnstructuredRetrieverPack\",\n",
    "    \"./embedded_tables_unstructured_pack\",\n",
    "    # leave the below commented out (was for testing purposes)\n",
    "    # llama_hub_url=\"https://raw.githubusercontent.com/run-llama/llama-hub/jerry/add_llama_packs/llama_hub\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "e5cbb063-2076-43ad-9a21-4a8fc2e005ec",
   "metadata": {},
   "outputs": [],
   "source": [
    "# TMP\n",
    "from llama_hub.llama_packs.recursive_retriever.embedded_tables_unstructured.base import (\n",
    "    EmbeddedTablesUnstructuredRetrieverPack,\n",
    ")\n",
    "\n",
    "embedded_tables_unstructured_pack = EmbeddedTablesUnstructuredRetrieverPack(\n",
    "    \"tesla_2021_10k.htm\", nodes_save_path=\"2021_nodes.pkl\"\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d8e2fcd8-c836-492a-8172-9969d37fab6a",
   "metadata": {},
   "source": [
    "### Run Pack"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "83f6347e-0b4b-4284-89e9-13c16ebd11ae",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[1;3;34mRetrieving with query id None: What was the revenue in 2020?\n",
      "\u001b[0m\u001b[1;3;38;5;200mRetrieved node with id, entering: id_478_table\n",
      "\u001b[0m\u001b[1;3;34mRetrieving with query id id_478_table: What was the revenue in 2020?\n",
      "\u001b[0m"
     ]
    }
   ],
   "source": [
    "# this will run the full pack\n",
    "response = embedded_tables_unstructured_pack.run(\"What was the revenue in 2020?\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "8bee7838-5852-4033-9df0-bb5037f05b20",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The revenue in 2020 was $31,536 million.\n"
     ]
    }
   ],
   "source": [
    "print(str(response))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "c3cd0cd9-632c-4dd5-9a29-c7ad40e82bcd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(response.source_nodes)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b1d89616-97a7-452e-92e2-a4c18fe569f3",
   "metadata": {},
   "source": [
    "### Inspect Modules"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "b2bd5b4a-8d8d-4b35-b0bc-47e69cfd7845",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'node_parser': UnstructuredElementNodeParser(include_metadata=True, include_prev_next_rel=True, callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x1270b7100>, llm=None, summary_query_str='What is this table about? Give a very concise summary (imagine you are adding a caption), and also output whether or not the table should be kept.'),\n",
       " 'recursive_retriever': <llama_index.retrievers.recursive_retriever.RecursiveRetriever at 0x1270b7a00>,\n",
       " 'query_engine': <llama_index.query_engine.retriever_query_engine.RetrieverQueryEngine at 0x2b2830400>}"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "modules = embedded_tables_unstructured_pack.get_modules()\n",
    "display(modules)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1df6183c-5629-442d-89c2-9353fb5dd8a9",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llama_hub",
   "language": "python",
   "name": "llama_hub"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
